Show code
library(tidyverse)
## get my transliteration table (I tried to guess the PsycInfo ASCII name from the PsycTESTS name)
translit <- readRDS("../sober_rubric/raw_data/psycinfo_psyctests_names.rds")
## get our first scrape (by journal, checking counts for each year in each journal for top tests)
psycinfo_scrape_by_journal <- read_tsv('../sober_rubric/raw_data/merged_table_all.tsv') %>%
drop_na(Name) %>%
# this tsv can be found in "Scraping-EBSCO-Host\data\merged tables"
# mutate(Name = toTitleCase(Name)) %>%
rename(usage_count = "Hit Count") %>%
group_by(Name, Year) %>%
summarise(usage_count = sum(usage_count))
## get our second scrape (by test DOI and year)
overview <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_clean_overview_table_1.tsv")
byyear <- readr::read_tsv("../sober_rubric/raw_data/20230617_ebsco_scrape_table_years_1.tsv")
byyear %>% group_by(DOI) %>% summarise(Hits = sum(Hits, na.rm=T)) %>% pull(Hits) %>% table()
.
0 1 2 3 4 5 6 7 8 9 10
27 13280 4107 2140 1487 1077 864 645 570 464 375
11 12 13 14 15 16 17 18 19 20 21
375 285 243 237 220 168 180 163 114 141 132
22 23 24 25 26 27 28 29 30 31 32
102 108 113 108 83 91 72 86 88 68 81
33 34 35 36 37 38 39 40 41 42 43
77 68 61 45 56 48 42 60 48 37 45
44 45 46 47 48 49 50 51 52 53 54
38 42 41 34 29 29 33 35 26 31 25
55 56 57 58 59 60 61 62 63 64 65
21 22 32 19 37 26 23 18 24 16 25
66 67 68 69 70 71 72 73 74 75 76
19 19 22 19 27 18 18 11 12 12 16
77 78 79 80 81 82 83 84 85 86 87
11 15 22 16 14 10 13 16 10 13 6
88 89 90 91 92 93 94 95 96 97 98
10 13 11 10 8 13 14 11 10 17 12
99 100 101 102 103 104 105 106 107 108 109
11 10 13 12 6 8 8 13 9 13 8
110 111 112 113 114 115 116 117 118 119 120
6 9 6 7 8 4 5 5 13 8 7
121 122 123 124 125 126 127 128 129 130 131
7 6 10 9 7 3 13 4 4 11 6
132 133 134 135 136 137 138 139 140 141 142
4 3 6 5 7 3 6 4 3 8 7
143 144 145 146 147 148 149 150 151 152 153
9 9 4 8 3 9 4 7 9 6 5
154 155 156 157 158 159 160 161 162 163 164
5 3 6 5 5 5 4 6 3 3 4
165 166 167 168 169 170 171 172 173 174 175
3 3 5 1 2 5 3 3 3 3 5
176 177 178 179 180 181 182 183 184 185 186
2 2 2 4 8 5 4 4 6 5 2
187 189 190 191 192 193 194 195 196 197 198
1 3 5 6 1 6 4 5 4 4 1
199 200 201 202 203 204 205 206 207 208 209
1 3 3 5 1 3 3 3 5 2 5
210 211 212 213 214 215 216 218 219 220 221
3 7 1 3 4 2 3 4 3 3 4
222 223 224 225 226 227 228 230 231 233 234
1 2 6 4 1 1 3 1 4 2 3
235 236 237 238 239 240 241 242 244 245 246
2 2 1 1 4 6 2 1 1 4 4
247 248 249 251 252 254 255 256 257 258 259
1 1 1 2 1 1 2 3 1 2 3
260 262 263 264 266 267 268 269 270 271 272
3 4 3 1 1 2 1 2 2 1 3
274 275 276 278 279 280 282 283 284 285 286
3 1 2 4 4 2 2 2 2 2 1
287 288 290 291 292 293 294 295 296 297 298
2 1 1 2 1 3 3 1 2 2 2
299 300 304 305 307 308 309 311 312 313 314
3 1 1 1 1 4 1 1 1 1 1
315 316 318 319 320 322 324 325 326 327 329
3 2 1 3 4 2 1 2 1 1 2
330 331 332 333 334 337 338 339 341 342 346
1 2 4 1 1 1 1 1 1 1 2
347 348 349 353 358 359 361 363 364 367 368
1 1 1 1 3 2 2 1 2 1 1
371 372 376 377 379 380 384 387 389 392 393
2 1 1 2 1 1 2 2 1 1 1
394 396 397 398 400 401 405 407 408 411 414
1 1 2 1 2 1 2 2 1 1 1
415 418 419 423 424 428 429 430 431 436 437
1 1 1 1 1 1 1 1 2 1 1
438 441 443 445 446 451 452 456 460 462 464
2 3 2 1 2 1 1 1 1 1 1
466 470 483 485 486 488 491 495 499 500 504
2 1 1 1 1 1 1 1 1 3 1
512 518 519 520 528 529 532 534 535 537 538
1 1 1 1 2 1 1 1 1 1 1
539 540 542 544 545 546 550 553 554 556 561
1 1 1 2 1 1 1 1 1 1 1
562 568 569 570 574 577 584 585 589 595 597
1 1 1 1 2 1 1 1 1 1 1
598 600 601 603 604 623 626 627 631 632 633
1 1 1 1 1 1 1 1 2 1 1
639 642 656 658 660 661 662 669 671 675 677
1 2 1 1 1 1 1 1 1 1 1
678 679 682 686 688 696 698 700 709 710 712
1 1 1 1 1 1 1 1 1 1 1
714 716 718 720 722 724 725 727 728 730 732
1 2 2 1 1 1 1 1 2 1 1
733 755 761 762 764 772 773 780 783 794 796
1 1 1 1 1 1 1 1 1 1 2
800 808 812 813 816 819 825 840 844 845 847
1 2 1 1 2 1 1 1 1 1 2
848 849 856 862 871 886 891 908 911 915 919
1 1 1 1 1 1 1 2 1 1 1
928 933 934 935 950 959 969 973 974 981 988
1 2 1 2 1 1 2 2 1 1 1
992 993 1009 1015 1018 1043 1071 1074 1077 1119 1121
1 1 1 1 1 1 1 1 1 1 1
1131 1135 1161 1163 1172 1173 1181 1184 1219 1224 1247
1 1 1 1 1 1 1 1 1 1 1
1251 1253 1255 1267 1296 1300 1323 1340 1378 1380 1392
1 1 1 1 1 1 1 1 1 1 1
1395 1399 1402 1429 1470 1479 1487 1519 1521 1553 1562
1 1 1 1 1 1 1 2 1 1 1
1569 1579 1642 1648 1688 1748 1772 1825 1868 1901 1932
1 1 1 1 1 1 1 1 1 1 1
1937 2052 2065 2074 2102 2121 2130 2132 2149 2200 2254
1 1 1 1 1 1 1 1 1 1 1
2304 2352 2584 2678 2700 2847 3053 3067 3134 3157 3487
1 1 1 1 1 1 1 1 1 1 1
3500 3637 3675 3750 3790 4041 4096 4410 4484 4876 4888
1 1 1 1 1 1 1 1 1 1 1
5147 6257 6313 6365 6408 6494 7023 7095 7238 7504 7597
1 1 1 1 1 1 1 1 1 1 1
8420 8513 8709 9492 10896 12134 13316 14268 18484 25118
1 1 1 1 1 1 1 1 1 1
Show code
# A tibble: 13,280 × 4
DOI first_pub_year Year Hits
<chr> <dbl> <dbl> <dbl>
1 10.1037/t00002-000 2014 2014 1
2 10.1037/t00046-000 2009 2009 1
3 10.1037/t00053-000 2013 2013 1
4 10.1037/t00077-000 2015 2015 1
5 10.1037/t00113-000 2012 2012 1
6 10.1037/t00119-000 2015 2015 1
7 10.1037/t00127-000 1949 1949 1
8 10.1037/t00128-000 2014 2014 1
9 10.1037/t00131-000 1961 1961 1
10 10.1037/t00193-000 2016 2016 1
# ℹ 13,270 more rows
Show code
byyear <- byyear %>% anti_join(one_hit_wonders, by = "DOI")
psycinfo_by_doi <- one_hit_wonders %>%
select(DOI, Year, Hits) %>%
bind_rows(byyear) %>%
left_join(overview %>% rename(total_hits = Hits), by = "DOI")
## don't use tests with names that occur many times
dupe_names <- translit %>% group_by(name_psycinfo) %>% filter(n() > 1) %>% ungroup()
translit <- translit %>% group_by(name_psycinfo) %>%
mutate(non_unique_name = n() > 1) %>%
filter(row_number() == 1) %>% ungroup()
# merge it all
psycinfo <- psycinfo_by_doi %>%
full_join(translit %>% select(DOI, name_psycinfo, NameOC), by = "DOI") %>%
full_join(psycinfo_scrape_by_journal, by = c("name_psycinfo" = "Name", "Year")) %>%
rename(hits_scrape_1 = usage_count,
hits_scrape_2 = Hits,
total_hits_scrape_2 = total_hits) %>%
group_by(name_psycinfo) %>%
mutate(total_hits_scrape_1 = sum(hits_scrape_1))
psycinfo %>% is.na() %>% colSums()
DOI Year hits_scrape_2
96747 39022 135768
first_pub_year last_pub_year total_hits_scrape_2
135768 135768 135768
name_psycinfo NameOC hits_scrape_1
3079 99825 218121
total_hits_scrape_1
265989
Show code
## aggregate it all
psycinfo_overall <- psycinfo %>%
group_by(name_psycinfo) %>%
summarise(total_hits_scrape_1 = sum(hits_scrape_1, na.rm = T),
total_hits_scrape_2 = sum(hits_scrape_2, na.rm = T)) %>%
left_join(translit %>% select(DOI, name_psycinfo))
## correlate totals
cor.test(psycinfo_overall$total_hits_scrape_1, psycinfo_overall$total_hits_scrape_2)
Pearson's product-moment correlation
data: psycinfo_overall$total_hits_scrape_1 and psycinfo_overall$total_hits_scrape_2
t = 249.62, df = 104320, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6076982 0.6152964
sample estimates:
cor
0.6115114
Show code
psycinfo_overall %>%
filter(total_hits_scrape_1 > 0, total_hits_scrape_2 > 0) %>%
summarise(cor(total_hits_scrape_1, total_hits_scrape_2))
# A tibble: 1 × 1
`cor(total_hits_scrape_1, total_hits_scrape_2)`
<dbl>
1 0.904
Show code
## correlate by year, diffs, proportions
cor.test(psycinfo$hits_scrape_1, psycinfo$hits_scrape_2)
Pearson's product-moment correlation
data: psycinfo$hits_scrape_1 and psycinfo$hits_scrape_2
t = 467.52, df = 39014, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9196473 0.9226533
sample estimates:
cor
0.921164
Show code
[1] 12.3914
Show code
Show code
[1] 11.99798
Show code
# psycinfo %>% filter(hits_scrape_1 > hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()
psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% nrow()
[1] 27545
Show code
.
-165 -143 -99 -98 -84 -81 -73 -50 -43 -41 -39 -35 -31 -27
1 1 1 1 1 1 1 1 1 1 1 1 1 1
-23 -21 -19 -15 -13 140 143 154 179 182 186 190 195 206
1 1 1 1 1 1 1 1 1 1 1 1 1 1
216 226 228 233 239 241 243 246 248 250 253 257 258 260
1 1 1 1 1 1 1 1 1 1 1 1 1 1
262 263 265 268 269 274 278 281 284 285 287 290 293 294
1 1 1 1 1 1 1 1 1 1 1 1 1 1
298 301 302 307 311 312 313 316 319 325 326 327 328 332
1 1 1 1 1 1 1 1 1 1 1 1 1 1
334 335 337 340 344 347 350 351 355 358 359 365 376 379
1 1 1 1 1 1 1 1 1 1 1 1 1 1
381 383 394 396 398 400 404 406 410 413 414 416 417 418
1 1 1 1 1 1 1 1 1 1 1 1 1 1
421 428 429 430 432 433 434 437 439 441 443 446 449 460
1 1 1 1 1 1 1 1 1 1 1 1 1 1
462 466 474 490 493 495 496 502 510 511 512 516 526 531
1 1 1 1 1 1 1 1 1 1 1 1 1 1
539 553 562 563 567 571 577 586 590 602 604 613 633 639
1 1 1 1 1 1 1 1 1 1 1 1 1 1
640 644 655 659 661 683 691 700 701 704 714 736 765 771
1 1 1 1 1 1 1 1 1 1 1 1 1 1
775 791 804 806 828 854 858 865 879 919 950 955 965 966
1 1 1 1 1 1 1 1 1 1 1 1 1 1
976 1004 1005 1265 1335 1591 -96 -17 -16 -12 105 119 135 136
1 1 1 1 1 1 2 2 2 2 2 2 2 2
141 153 157 159 160 162 164 165 167 169 172 173 174 180
2 2 2 2 2 2 2 2 2 2 2 2 2 2
183 191 197 198 200 207 211 217 218 225 232 235 236 238
2 2 2 2 2 2 2 2 2 2 2 2 2 2
244 256 261 267 270 272 273 282 288 295 304 305 306 317
2 2 2 2 2 2 2 2 2 2 2 2 2 2
318 322 339 342 346 349 352 369 373 375 380 385 392 407
2 2 2 2 2 2 2 2 2 2 2 2 2 2
408 431 436 438 440 450 456 548 680 -11 -9 99 118 138
2 2 2 2 2 2 2 2 2 3 3 3 3 3
146 158 171 177 178 185 189 192 196 199 202 204 205 208
3 3 3 3 3 3 3 3 3 3 3 3 3 3
215 219 220 222 223 234 247 254 264 275 279 286 297 303
3 3 3 3 3 3 3 3 3 3 3 3 3 3
309 329 336 356 367 374 382 537 -14 -10 132 134 142 144
3 3 3 3 3 3 3 3 4 4 4 4 4 4
145 148 150 163 170 176 187 188 193 194 224 255 366 -8
4 4 4 4 4 4 4 4 4 4 4 4 4 5
97 126 129 139 149 152 155 156 161 166 168 175 181 184
5 5 5 5 5 5 5 5 5 5 5 5 5 5
209 229 231 107 109 111 116 125 130 151 103 104 113 117
5 5 5 6 6 6 6 6 6 6 7 7 7 7
120 121 127 133 137 147 89 115 123 -7 110 112 114 124
7 7 7 7 7 7 8 8 8 9 9 9 9 9
131 86 90 92 100 101 108 122 128 74 87 88 95 96
9 10 10 10 10 10 10 10 10 11 11 11 11 11
102 91 93 94 106 77 85 98 63 82 72 80 81 83
11 12 12 12 12 13 14 14 15 15 16 16 16 16
84 76 70 73 79 -6 65 64 69 75 78 60 68 71
16 17 18 18 19 20 20 21 21 22 23 24 24 24
66 67 62 -5 59 61 55 57 58 50 53 56 46 52
26 28 29 30 31 32 33 33 34 35 35 39 43 45
54 43 40 48 51 49 45 42 44 41 47 39 38 -4
45 46 47 47 48 50 52 55 58 59 61 62 69 70
37 34 36 35 33 32 30 29 31 27 28 26 25 23
76 77 77 85 96 101 102 107 108 112 123 142 157 163
-3 24 22 21 20 19 18 17 16 15 14 13 12 11
165 166 183 200 225 248 268 285 311 357 383 431 544 591
-2 10 9 8 7 6 5 4 3 2 -1 1 0
615 688 765 933 1066 1228 1589 2033 2638 3487 3759 4918 6757
Show code
# psycinfo %>% filter(hits_scrape_1 < hits_scrape_2) %>% select(DOI, Year, name_psycinfo, NameOC, hits_scrape_1, hits_scrape_2) %>% mutate(diff = hits_scrape_2 - hits_scrape_1) %>% arrange(diff) %>% View()
Top Tests in each
Only in PsycInfo Scrape 1
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
<int> <dbl> <dbl>
1 35864 254316 7.09
Show code
options(cols.min.print = 2, cols.print = 2)
Only in PsycTests Scrape 2
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_2)` `sum(total_hits_scrape_2)/n()`
<int> <dbl> <dbl>
1 22052 57411 2.60
Show code
psycinfo_overall %>%
ungroup() %>%
filter(total_hits_scrape_1 == 0, total_hits_scrape_2 >= 1) %>%
# filter(!is.na(DOI), is.na(total_hits_scrape_1) | total_hits_scrape_1 == 0) %>%
drop_na(name_psycinfo, total_hits_scrape_2) %>%
arrange(desc(total_hits_scrape_2)) %>%
select( name_psycinfo, total_hits_scrape_2) %>%
DT::datatable()
Hits only in scrape 1, even though we have a match for the name
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
<int> <dbl> <dbl>
1 1453 22239 15.3
Hits only in scrape 1 without a clear match for the name
Show code
# A tibble: 1 × 3
`n()` `sum(total_hits_scrape_1)` `sum(total_hits_scrape_1)/n()`
<int> <dbl> <dbl>
1 34411 232077 6.74
Merge Scrape 1 and 2
Show code
psycinfo_scrape_1_without_hits_in_2 <- psycinfo_overall %>%
ungroup() %>%
filter(total_hits_scrape_1 > 0, is.na(total_hits_scrape_2) | total_hits_scrape_2 == 0) %>%
select(DOI, name_psycinfo) %>%
distinct(name_psycinfo, .keep_all = TRUE) %>%
left_join(psycinfo_scrape_by_journal %>%
rename(name_psycinfo = Name, Hits = usage_count), by = "name_psycinfo", multiple = "all") %>%
mutate(DOI = coalesce(DOI, name_psycinfo)) %>%
group_by(DOI) %>%
mutate(first_pub_year = min(Year, na.rm = T),
last_pub_year = max(Year, na.rm = T),
total_hits = sum(Hits, na.rm = T)) %>%
ungroup()
psycinfo_scrape_1_without_hits_in_2 %>%
summarise(n_distinct(DOI), sum(Hits), sum(Hits)/n_distinct(DOI))
# A tibble: 1 × 3
`n_distinct(DOI)` `sum(Hits)` `sum(Hits)/n_distinct(DOI)`
<int> <dbl> <dbl>
1 35864 254316 7.09
Show code
[1] 3078
Show code
sum(!is.na(psycinfo_by_doi_with_hits$name_psycinfo))
[1] 215037
Show code
# A tibble: 1 × 3
`n_distinct(DOI)` `sum(Hits, na.rm = T)` sum(Hits, na.rm = T)/n_di…¹
<int> <dbl> <dbl>
1 31118 876009 28.2
# ℹ abbreviated name: ¹`sum(Hits, na.rm = T)/n_distinct(DOI)`
Show code
psycinfo_merged <- bind_rows(
scrape_2 = psycinfo_by_doi_with_hits,
scrape_1 = psycinfo_scrape_1_without_hits_in_2, .id = "source")
psycinfo_merged %>%
summarise(n_distinct(DOI), sum(Hits, na.rm = T), sum(Hits, na.rm = T)/n_distinct(DOI))
# A tibble: 1 × 3
`n_distinct(DOI)` `sum(Hits, na.rm = T)` sum(Hits, na.rm = T)/n_di…¹
<int> <dbl> <dbl>
1 66982 1130325 16.9
# ℹ abbreviated name: ¹`sum(Hits, na.rm = T)/n_distinct(DOI)`
Show code
saveRDS(psycinfo_merged, "../sober_rubric/raw_data/psycinfo_merged_scrape_1_and_2.rds")
Joint top list